{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-08-23T09:24:11.796220Z",
     "start_time": "2019-08-23T09:24:11.792233Z"
    }
   },
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "import gc\n",
    "from sklearn.model_selection import KFold,StratifiedKFold\n",
    "import lightgbm as lgb\n",
    "from sklearn.metrics import accuracy_score\n",
    "from sklearn.preprocessing import LabelEncoder\n",
    "import os"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-08-23T09:24:13.818364Z",
     "start_time": "2019-08-23T09:24:13.508770Z"
    }
   },
   "outputs": [],
   "source": [
    "train_data = pd.read_csv('data/age_train.csv', header=None)\n",
    "test_data = pd.read_csv('data/age_test.csv', header=None)\n",
    "train_data.columns = ['uId', 'age_group']\n",
    "test_data.columns = ['uId']\n",
    "data_id = pd.concat([train_data, test_data], axis=0, sort=False)['uId']\n",
    "data_id = data_id.reset_index()\n",
    "del data_id['index']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-08-23T09:25:21.148602Z",
     "start_time": "2019-08-23T09:24:29.080553Z"
    }
   },
   "outputs": [],
   "source": [
    "f1 = pd.read_csv('lgb_feature/f1.csv')\n",
    "f2 = pd.read_csv('lgb_feature/f2.csv')\n",
    "f3 = pd.read_csv('lgb_feature/f3.csv')\n",
    "f4 = pd.read_csv('lgb_feature/f4.csv')\n",
    "f5 = pd.read_csv('lgb_feature/f5.csv')\n",
    "f6 = pd.read_csv('lgb_feature/f6.csv')\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-08-23T09:25:21.468264Z",
     "start_time": "2019-08-23T09:25:21.150245Z"
    }
   },
   "outputs": [],
   "source": [
    "# train_lgb_snake = pd.read_hdf('lgb_feature/lgb_633_proba.hdf', key='train')\n",
    "# test_lgb_snake = pd.read_hdf('lgb_feature/lgb_633_proba.hdf', key='test')\n",
    "# lgb_snake = pd.concat([train_lgb_snake, test_lgb_snake], sort=False, axis=0)\n",
    "# del lgb_snake['age_group']\n",
    "# lgb_snake.columns = ['snake1', 'snake2', 'snake3', 'snake4', 'snake5', 'snake6']\n",
    "# lgb_snake = lgb_snake.reset_index()\n",
    "# del lgb_snake['uid']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-08-23T09:25:21.682719Z",
     "start_time": "2019-08-23T09:25:21.470418Z"
    }
   },
   "outputs": [],
   "source": [
    "# train_lgb_snake = pd.read_hdf('lgb_feature/ctr_649.hdf', key='train')\n",
    "# test_lgb_snake = pd.read_hdf('lgb_feature/ctr_649.hdf', key='test')\n",
    "# ctr_snake = pd.concat([train_lgb_snake, test_lgb_snake], sort=False, axis=0)\n",
    "# ctr_snake.columns = ['ctr_snake1', 'ctr_snake2', 'ctr_snake3', 'ctr_snake4',\n",
    "#                                  'ctr_snake5', 'ctr_snake6']\n",
    "# ctr_snake = ctr_snake.reset_index()\n",
    "# del ctr_snake['index']\n",
    "# ctr_snake"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-08-23T09:25:27.622338Z",
     "start_time": "2019-08-23T09:25:21.684383Z"
    }
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/opt/conda/lib/python3.6/site-packages/ipykernel_launcher.py:10: SettingWithCopyWarning: \n",
      "A value is trying to be set on a copy of a slice from a DataFrame.\n",
      "Try using .loc[row_indexer,col_indexer] = value instead\n",
      "\n",
      "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
      "  # Remove the CWD from sys.path while we load stuff.\n"
     ]
    }
   ],
   "source": [
    "feature = pd.concat([\n",
    "                    f1, f2, f3,\n",
    "                    f4, f5, f6,\n",
    "#     f7,\n",
    "#                     nn_1, nn_2, nn_3,\n",
    "#                     lgb_snake, ctr_snake\n",
    "#                     snake_1\n",
    "], axis=1, sort=False)\n",
    "train_feature = feature[:len(train_data)]\n",
    "train_feature['label'] = train_data['age_group'] - 1\n",
    "test_feature = feature[len(train_data):]\n",
    "\n",
    "not_cols = ['label']\n",
    "cols = [col for col in train_feature.columns if col not in not_cols]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-08-23T09:25:27.634786Z",
     "start_time": "2019-08-23T09:25:27.623954Z"
    }
   },
   "outputs": [],
   "source": [
    "def evaluate_5_fold(train_df, test_df, cols, test=False):\n",
    "    kf = KFold(n_splits=5, shuffle=True, random_state=1017)\n",
    "    y_test = 0\n",
    "    oof_train = np.zeros((train_df.shape[0], 6))\n",
    "    for i, (train_index, val_index) in enumerate(kf.split(train_df[cols],train_df.label)):\n",
    "        X_train, y_train = train_df.loc[train_index, cols], train_df.label.values[train_index]\n",
    "        X_val, y_val = train_df.loc[val_index, cols], train_df.label.values[val_index]\n",
    "\n",
    "        lgb_train = lgb.Dataset(\n",
    "            X_train, y_train)\n",
    "        lgb_eval = lgb.Dataset(\n",
    "            X_val, y_val,\n",
    "            reference=lgb_train)\n",
    "        params = {\n",
    "            'boosting_type': 'gbdt',\n",
    "            'learning_rate' : 0.04, \n",
    "            'verbose': 0,\n",
    "            'num_leaves':256,\n",
    "            # 'max_depth':8, \n",
    "            # 'max_bin':10, \n",
    "            # 'lambda_l2': 1, \n",
    "            'min_child_weight':30,\n",
    "            \"num_class\":6,\n",
    "            'objective':'multiclass', \n",
    "            'feature_fraction':0.4,\n",
    "            'bagging_fraction':0.7, # 0.9是目前最优的\n",
    "            'bagging_freq':5,  # 3是目前最优的\n",
    "#           'min_data': 500,\n",
    "            'seed': 1017,\n",
    "            'metric':'multi_error',\n",
    "            'nthread': 50,\n",
    "            # 'silent': True,\n",
    "        }\n",
    "\n",
    "        gbm = lgb.train(params,\n",
    "                        lgb_train,\n",
    "                        num_boost_round=40000,\n",
    "                        valid_sets=lgb_eval,\n",
    "                        early_stopping_rounds=100,\n",
    "                        verbose_eval=100,\n",
    "                        )\n",
    "        y_pred = gbm.predict(X_val, num_iteration=gbm.best_iteration)\n",
    "        if test:\n",
    "            y_test += gbm.predict(test_df[cols], num_iteration=gbm.best_iteration)\n",
    "        oof_train[val_index] = y_pred\n",
    "        break\n",
    "    auc = accuracy_score(train_df.label.values, np.argmax(oof_train, axis=1))\n",
    "    y_test /= 5\n",
    "    feature_list = pd.DataFrame()\n",
    "    feature_list['names'] = cols\n",
    "    feature_list['imp'] = gbm.feature_importance()\n",
    "    feature_list = feature_list.sort_values(by='imp', ascending=False)\n",
    "    print(feature_list)\n",
    "    print('5 Fold auc:', auc)\n",
    "    gc.collect()\n",
    "    return auc, oof_train, y_test, feature_list"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {
    "ExecuteTime": {
     "start_time": "2019-08-23T09:25:01.009Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Training until validation scores don't improve for 100 rounds.\n",
      "[100]\tvalid_0's multi_error: 0.39606\n",
      "[200]\tvalid_0's multi_error: 0.388186\n",
      "[300]\tvalid_0's multi_error: 0.384359\n",
      "[400]\tvalid_0's multi_error: 0.38241\n",
      "[500]\tvalid_0's multi_error: 0.381095\n",
      "[600]\tvalid_0's multi_error: 0.380135\n",
      "[700]\tvalid_0's multi_error: 0.379585\n",
      "[800]\tvalid_0's multi_error: 0.379226\n",
      "[900]\tvalid_0's multi_error: 0.378865\n",
      "[1000]\tvalid_0's multi_error: 0.378688\n",
      "[1100]\tvalid_0's multi_error: 0.378451\n",
      "[1200]\tvalid_0's multi_error: 0.378296\n",
      "[1300]\tvalid_0's multi_error: 0.378252\n",
      "[1400]\tvalid_0's multi_error: 0.3781\n",
      "[1500]\tvalid_0's multi_error: 0.378028\n",
      "Early stopping, best iteration is:\n",
      "[1432]\tvalid_0's multi_error: 0.377975\n",
      "              names    imp\n",
      "72    romLeftRation  39549\n",
      "106  date_std_times  35945\n",
      "102             app  34946\n",
      "88      app_len_std  33663\n",
      "93         date_std  33047\n",
      "..              ...    ...\n",
      "81          carrier   5364\n",
      "82             os_1   4750\n",
      "66       FFuncTimes   2744\n",
      "83             os_2   2375\n",
      "78     color_length   2307\n",
      "\n",
      "[117 rows x 2 columns]\n",
      "5 Fold auc: 0.22446225\n"
     ]
    }
   ],
   "source": [
    "auc, oof_train, y_test, imp = evaluate_5_fold(train_feature, test_feature, cols, True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-08-23T02:54:45.264950Z",
     "start_time": "2019-08-23T02:54:27.211161Z"
    }
   },
   "outputs": [],
   "source": [
    "stack_lgb_train = pd.DataFrame(oof_train)\n",
    "stack_lgb_test = pd.DataFrame(y_test)\n",
    "stack_lgb = pd.DataFrame(np.around(pd.concat([stack_lgb_train, stack_lgb_test], axis=0, sort=False), 6))\n",
    "stack_lgb.to_csv('result/lgb_stack_feature.csv', index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-08-23T02:54:46.713141Z",
     "start_time": "2019-08-23T02:54:45.266907Z"
    }
   },
   "outputs": [],
   "source": [
    "result = np.argmax(y_test, axis=1) + 1\n",
    "put_result = pd.DataFrame()\n",
    "put_result['id'] = test_data['uId']\n",
    "put_result['label'] = result\n",
    "put_result.to_csv('result/submission.csv', index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-08-15T14:00:20.865538Z",
     "start_time": "2019-08-15T14:00:19.932546Z"
    }
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
